Cargamos los paquetes necesarios
library(dplyr)
library(sf)
library(gstat)
library(geoR)
library(spdep)
library(DataExplorer)
library(psych)
library(ggcorrplot)
library(biogeo)
library(ggplot2)
library(leaflet)
Cargamos los datasets
estaciones <- read.csv("data_smn/preprocessed/estaciones_smn_v2.csv")
horarios <- read.csv("data_smn/preprocessed/datohorario20210420.csv")
Observamos como esta compuesto el dataset
glimpse(estaciones)
Rows: 123
Columns: 9
$ NOMBRE <chr> "BASE BELGRANO II", "BASE CARLINI (EX JUBANY)", "BASE ESPERANZA", "BASE MARAMBIO", "BASE ORC…
$ PROVINCIA <chr> "ANTARTIDA", "ANTARTIDA", "ANTARTIDA", "ANTARTIDA", "ANTARTIDA", "ANTARTIDA", "BUENOS AIRES"…
$ LATITUD_GRADOS <int> 77, 62, 63, 64, 60, 68, 36, 38, 37, 36, 34, 38, 37, 36, 34, 34, 34, 34, 36, 37, 34, 34, 34, …
$ LATITUD_MINUTOS <int> 52, 14, 24, 14, 45, 8, 50, 44, 43, 12, 32, 1, 26, 21, 36, 49, 33, 58, 2, 56, 33, 41, 40, 27,…
$ LONGITUD_GRADOS <int> 34, 58, 56, 56, 44, 67, 59, 62, 59, 61, 58, 61, 61, 57, 58, 58, 60, 57, 59, 57, 58, 58, 58, …
$ LONGITUD_MINUTOS <int> 34, 38, 59, 43, 43, 8, 53, 10, 47, 4, 40, 20, 53, 44, 36, 32, 55, 54, 8, 35, 49, 44, 38, 53,…
$ ALTURA <int> 256, 11, 24, 198, 12, 7, 147, 83, 207, 94, 26, 247, 233, 9, 12, 20, 81, 23, 36, 21, 32, 36, …
$ NRO <int> 89034, 89053, 88963, 89055, 88968, 89066, 87641, 87750, 87649, 87640, 87570, 87683, 87637, 8…
$ NroOACI <chr> "SAYB", "SAYJ", "SAYE", "SAWB", "SAYO", "SAYS", "SAZA", "SAZB", "SAZJ", "SAZI", "SADO", "SAB…
print("#--------------------#")
[1] "#--------------------#"
glimpse(horarios)
Rows: 2,041
Columns: 8
$ FECHA <int> 20042021, 20042021, 20042021, 20042021, 20042021, 20042021, 20042021, 20042021, 20042021, 20042021, 20…
$ HORA <int> 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5…
$ TEMP <dbl> 20.7, 19.8, 19.5, 19.2, 19.2, 18.9, 19.0, 18.8, 19.0, 19.1, 19.9, 20.3, 21.8, 22.9, 23.2, 23.7, 23.2, …
$ HUM <int> 70, 79, 79, 81, 81, 84, 84, 84, 84, 84, 81, 82, 68, 72, 68, 66, 71, 68, 75, 72, 73, 68, 68, 68, 90, 88…
$ PNM <dbl> 1020.8, 1020.9, 1020.9, 1020.4, 1020.0, 1020.3, 1020.7, 1021.1, 1021.6, 1022.2, 1022.4, 1021.9, 1021.7…
$ DD <int> 70, 70, 70, 70, 70, 50, 70, 50, 50, 50, 50, 50, 50, 70, 70, 70, 70, 70, 90, 90, 90, 90, 70, 70, 50, 50…
$ FF <int> 28, 28, 26, 26, 22, 22, 17, 17, 15, 19, 20, 24, 20, 19, 20, 17, 15, 15, 19, 20, 22, 22, 31, 35, 17, 19…
$ NOMBRE <chr> "AEROPARQUE AERO", "AEROPARQUE AERO", "AEROPARQUE AERO", "AEROPARQUE AERO", "AEROPARQUE AERO", "AEROPA…
Observamos que la variable que representa la velocidad del viento en km/h (FF) es de tipo Char, por lo cual debemos convertila en int.
horarios$FF <- as.numeric(horarios$FF)
Observamos el resumen estadistico de las variables de potencial interes
print("altura")
[1] "altura"
summary(estaciones$ALTURA)
Min. 1st Qu. Median Mean 3rd Qu. Max.
3.0 53.5 145.0 331.5 455.0 3459.0
print("hora")
[1] "hora"
summary(horarios$HORA)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 7.00 12.00 12.16 18.00 23.00
print("temperatura")
[1] "temperatura"
summary(horarios$TEMP)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
-20.00 16.10 20.20 19.09 23.80 33.60 1
print("humedad")
[1] "humedad"
summary(horarios$HUM)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
18.00 59.00 72.00 71.18 85.00 100.00 4
print("presion atmosferica")
[1] "presion atmosferica"
summary(horarios$PNM)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
960.8 1011.0 1015.1 1075.3 1019.2 3133.0 23
print("direccion del viento")
[1] "direccion del viento"
summary(horarios$DD)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.0 50.0 70.0 118.9 160.0 990.0 1
print("velocidad del viento")
[1] "velocidad del viento"
summary(horarios$FF)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.00 9.00 15.00 16.06 22.00 107.00 1
Se puede observar la presencia de observaciones NA’s, por lo cual procedemos a eliminarlas
which(is.na(horarios), arr.ind=TRUE)
row col
[1,] 771 3
[2,] 587 4
[3,] 771 4
[4,] 1876 4
[5,] 1877 4
[6,] 237 5
[7,] 771 5
[8,] 973 5
[9,] 974 5
[10,] 975 5
[11,] 976 5
[12,] 977 5
[13,] 978 5
[14,] 979 5
[15,] 980 5
[16,] 1392 5
[17,] 1393 5
[18,] 1394 5
[19,] 1395 5
[20,] 1396 5
[21,] 1397 5
[22,] 1398 5
[23,] 1399 5
[24,] 1400 5
[25,] 1401 5
[26,] 1402 5
[27,] 1403 5
[28,] 1441 5
[29,] 771 6
[30,] 771 7
horarios[771,]
Como toda la fila para la rioja aero es NA, lla eliminamos
horarios <- horarios[-c(771), ]
rownames(horarios) <- 1:nrow(horarios)
horarios[587,]
Completamos la columna humedad con el promedio
# FORMOSA AERO
formosa <- horarios[horarios$NOMBRE == "FORMOSA AERO",]
which(is.na(formosa), arr.ind=TRUE)
row col
587 4 4
formosa <- na.omit(formosa)
horarios[587,4] = mean(formosa$HUM)
horarios[587,]
remove(formosa)
which(is.na(horarios), arr.ind=TRUE)
row col
1875 1875 4
1876 1876 4
237 237 5
972 972 5
973 973 5
974 974 5
975 975 5
976 976 5
977 977 5
978 978 5
979 979 5
1391 1391 5
1392 1392 5
1393 1393 5
1394 1394 5
1395 1395 5
1396 1396 5
1397 1397 5
1398 1398 5
1399 1399 5
1400 1400 5
1401 1401 5
1402 1402 5
1440 1440 5
horarios[c(1875,1876),]
Completamos con el promedio de humedad calculado para tucuman aero
# TUCUMAN AERO
tucuman_aero <- horarios[horarios$NOMBRE == "TUCUMAN AERO",]
which(is.na(tucuman_aero), arr.ind=TRUE)
row col
1875 1 4
1876 2 4
tucuman_aero <- na.omit(tucuman_aero)
horarios[1875,4] = mean(tucuman_aero$HUM)
horarios[1876,4] = mean(tucuman_aero$HUM)
horarios[c(1875,1876),]
remove(tucuman_aero)
horarios[237,]
# CATAMARCA AERO
catamarca_aero <- horarios[horarios$NOMBRE == "CATAMARCA AERO",]
which(is.na(catamarca_aero), arr.ind=TRUE)
row col
237 2 5
catamarca_aero <- na.omit(catamarca_aero)
horarios[237,5] = mean(catamarca_aero$PNM)
horarios[237,]
remove(catamarca_aero)
horarios[c(972,973,974,975,976,977,978,979),]
# MERCEDES AERO
mercedes_aero <- horarios[horarios$NOMBRE == "MERCEDES AERO",]
which(is.na(mercedes_aero), arr.ind=TRUE)
row col
972 1 5
973 2 5
974 3 5
975 4 5
976 5 5
977 6 5
978 7 5
979 8 5
remove(mercedes_aero)
Vamos a completar los valores faltantes para la presion atmosferica con el promedio de la provincia de corrientes a la que pertenece Mercedes aero
corrientes_aero <- horarios[(horarios$NOMBRE == "CORRIENTES AERO") | (horarios$NOMBRE == "ITUZAINGO") | (horarios$NOMBRE == "MONTE CASEROS AERO") | (horarios$NOMBRE == "PASO DE LOS LIBRES AERO"),]
which(is.na(corrientes_aero), arr.ind=TRUE)
row col
horarios[972,5] = mean(corrientes_aero$PNM)
horarios[973,5] = mean(corrientes_aero$PNM)
horarios[974,5] = mean(corrientes_aero$PNM)
horarios[975,5] = mean(corrientes_aero$PNM)
horarios[976,5] = mean(corrientes_aero$PNM)
horarios[977,5] = mean(corrientes_aero$PNM)
horarios[978,5] = mean(corrientes_aero$PNM)
horarios[979,5] = mean(corrientes_aero$PNM)
horarios[c(972,973,974,975,976,977,978,979),]
remove(corrientes_aero)
horarios[c(1391,1392,1394,1395,1396,1397,1398,1399,1400,1401,1402,1403),]
# RIO GALLEGOS AERO
gallegos_aero <- horarios[horarios$NOMBRE == "RIO GALLEGOS AERO",]
which(is.na(gallegos_aero), arr.ind=TRUE)
row col
1391 1 5
1392 2 5
1393 3 5
1394 4 5
1395 5 5
1396 6 5
1397 7 5
1398 8 5
1399 9 5
1400 10 5
1401 11 5
1402 12 5
gallegos_aero <- na.omit(gallegos_aero)
horarios[1391,5] = mean(gallegos_aero$PNM)
horarios[1392,5] = mean(gallegos_aero$PNM)
horarios[1393,5] = mean(gallegos_aero$PNM)
horarios[1394,5] = mean(gallegos_aero$PNM)
horarios[1395,5] = mean(gallegos_aero$PNM)
horarios[1396,5] = mean(gallegos_aero$PNM)
horarios[1397,5] = mean(gallegos_aero$PNM)
horarios[1398,5] = mean(gallegos_aero$PNM)
horarios[1399,5] = mean(gallegos_aero$PNM)
horarios[1400,5] = mean(gallegos_aero$PNM)
horarios[1401,5] = mean(gallegos_aero$PNM)
horarios[1402,5] = mean(gallegos_aero$PNM)
horarios[1403,5] = mean(gallegos_aero$PNM)
horarios[c(1391,1392,1394,1395,1396,1397,1398,1399,1400,1401,1402,1403),]
remove(gallegos_aero)
which(is.na(horarios), arr.ind=TRUE)
row col
1440 1440 5
horarios[1440,]
# RIO GRANDE
rio_grande_aero <- horarios[horarios$NOMBRE == "RIO GRANDE B.A.",]
which(is.na(rio_grande_aero), arr.ind=TRUE)
row col
1440 26 5
rio_grande_aero <- na.omit(rio_grande_aero)
horarios[1440,5] = mean(rio_grande_aero$PNM)
horarios[1440,]
remove(rio_grande_aero)
Validamos que efectivamente se hayan arreglado
print("temperatura")
[1] "temperatura"
summary(horarios$TEMP)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-20.00 16.10 20.20 19.09 23.80 33.60
print("humedad")
[1] "humedad"
summary(horarios$HUM)
Min. 1st Qu. Median Mean 3rd Qu. Max.
18.00 59.00 72.00 71.19 85.00 100.00
print("presion atmosferica")
[1] "presion atmosferica"
summary(horarios$PNM)
Min. 1st Qu. Median Mean 3rd Qu. Max.
960.8 1011.1 1015.1 1074.7 1019.1 3133.0
print("direccion del viento")
[1] "direccion del viento"
summary(horarios$DD)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0 50.0 70.0 118.9 160.0 990.0
print("velocidad del viento")
[1] "velocidad del viento"
summary(horarios$FF)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 9.00 15.00 16.06 22.00 107.00
Perfecto, ya no tenemos valores nulos en nuestro dataset.
A continuación, visualizaremos las distribuciones de las variables numericas de interés
hist(estaciones$ALTURA, main = "Histograma de la altura", xlab = "Altura")
hist(horarios$HORA, main = "Histograma de horarios", xlab = "Horarios")
hist(horarios$TEMP, main = "Histograma de temperatura", xlab = "Temperatura")
hist(horarios$HUM, main = "Histograma de humedad", xlab = "Humedad")
hist(horarios$PNM, main = "Histograma de presion atmosferica", xlab = "Presion Atmosferica")
hist(horarios$DD, main = "Histograma de la direccion del viento", xlab = "Direccion del viento")
hist(horarios$FF, main = "Histograma de la velocidad del viento", xlab = "Velocidad del viento")
Creamos unos boxplot para visualizar la distribucionde datos que potencialmente nos interecen para proceder con el analisis
boxplot(estaciones$ALTURA, main = "Boxplot de la altura", xlab = "Altura")
boxplot(horarios$HORA, main = "Boxplot de horarios", xlab = "Horarios")
boxplot(horarios$TEMP, main = "Boxplot de temperatura", xlab = "Temperatura")
boxplot(horarios$HUM, main = "Boxplot de humedad", xlab = "Humedad")
boxplot(horarios$PNM, main = "Boxplot de presion atmosferica", xlab = "Presion Atmosferica")
boxplot(horarios$DD, main = "Boxplot de la direccion del viento", xlab = "Direccion del viento")
boxplot(horarios$FF, main = "Boxplot de la velocidad del viento", xlab = "Velocidad del viento")
Los boxplots anteriores ponen en evidencia la existencia de outliers. ¿Pero son estos realmente outliers, o pertenecen a observaciones en lugares muy remotos? Esto lo analizaremos luego, al momento de graficar las estaciones en el mapa de Argentina.
Ahora, veamos que tan correlacionadas estan estas variables.
corr <- cor(horarios[, c(2,3,4,5,6,7)], use = "complete.obs")
ggcorrplot(corr, type = "lower", outline.col = "black",
lab=TRUE,
ggtheme = ggplot2::theme_gray,
colors = c("#6D9EC1", "white", "#E46726"))
Las variables que mas correlacionan con la velocidad del viento son HUMEDAD (negativamente) y HORA (positivamente). Tambien vemos que HORA y TEMPERATURA correlacionan negativamentecon HUMEDAD. Por ultimo, se observa que HORA y TEMPERATURA correlacionan positivamente
Analizamos ahora la simetria de la variable que representa la velocidad del viento ya que es la que mas nos interesa en este estudio.
skew(horarios$FF)
[1] 1.588087
kurtosi(horarios$FF)
[1] 7.188948
La medida de asimetria y kurtosi terminan de validar lo que observamos en el histograma. La variable FF es asimetrica a derecha y tiene una mayor concentracion de valores muy cerca de la media de la distribución y muy lejos de la cola de la distribucion.
Un detalle no menor del dataset de estaciones es que las latitudes y longitudes estan expresadads en grados y minutos. Para poder trabajar con ellas, necesitamos que esten expresadas en valores decimales. Por eso, en el siguiente bloque de código vamos a usar la funcion dms2dd para hacer esta conversión.
# creamos dos vectores vacios
latitud <- c()
longitud <- c()
# iteramos por cada fila del dataset de estaciones y hacemos la convesion de latitud y longitud
for(i in 1:nrow(estaciones)) {
latitud[i] <- dms2dd(dd = estaciones[i, "LATITUD_GRADOS"], mm = estaciones[i, "LATITUD_MINUTOS"], ss = 0, ns = "S")
longitud[i] <- dms2dd(dd = estaciones[i, "LONGITUD_GRADOS"], mm = estaciones[i, "LONGITUD_MINUTOS"], ss = 0, ns = "S")
}
# asignamos a latitud y longitud los valores convertidos
estaciones['LATITUD'] <- latitud
estaciones['LONGITUD'] <- longitud
Antes de unir estaciones, se removeran las columnas que no sean relevantes para este analisis. Las mismas son NRO y NroOACI, LATITUD_GRADOS, LATITUD_MINUTOS, LONGITUD_GRADOS, LONGITUD_MINUTOS. Asi como tambien se removera la variable fecha, ya que estos datos pertenecen al 20/04/2021
estaciones <- estaciones[c(1,2,7,10,11)]
horarios <- horarios[,c(3,4,5,6,7,8)]
summary(estaciones)
NOMBRE PROVINCIA ALTURA LATITUD LONGITUD
Length:123 Length:123 Min. : 3.0 Min. :-77.87 Min. :-72.05
Class :character Class :character 1st Qu.: 53.5 1st Qu.:-38.17 1st Qu.:-66.47
Mode :character Mode :character Median : 145.0 Median :-34.53 Median :-63.37
Mean : 331.5 Mean :-35.96 Mean :-62.82
3rd Qu.: 455.0 3rd Qu.:-30.25 3rd Qu.:-58.93
Max. :3459.0 Max. :-22.10 Max. :-34.57
summary(horarios$TEMP)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-20.00 16.10 20.20 19.09 23.80 33.60
Se procede a unificar las dos tablas usando la variable NOMBRE como punto para combinar los datasets
data <- inner_join(estaciones, horarios, by = c("NOMBRE" = "NOMBRE"))
glimpse(data)
Rows: 2,040
Columns: 10
$ NOMBRE <chr> "BASE BELGRANO II", "BASE BELGRANO II", "BASE BELGRANO II", "BASE BELGRANO II", "BASE BELGRANO II",…
$ PROVINCIA <chr> "ANTARTIDA", "ANTARTIDA", "ANTARTIDA", "ANTARTIDA", "ANTARTIDA", "ANTARTIDA", "ANTARTIDA", "ANTARTI…
$ ALTURA <int> 256, 256, 256, 256, 256, 256, 256, 256, 11, 11, 11, 11, 11, 11, 11, 11, 24, 24, 24, 24, 24, 24, 24,…
$ LATITUD <dbl> -77.86667, -77.86667, -77.86667, -77.86667, -77.86667, -77.86667, -77.86667, -77.86667, -62.23333, …
$ LONGITUD <dbl> -34.56667, -34.56667, -34.56667, -34.56667, -34.56667, -34.56667, -34.56667, -34.56667, -58.63333, …
$ TEMP <dbl> -20.0, -16.2, -16.9, -13.0, -15.1, -16.6, -15.3, -16.8, -0.1, -0.5, -0.9, -0.5, 0.3, 1.1, 1.3, 2.5,…
$ HUM <dbl> 70, 72, 80, 73, 86, 66, 77, 85, 88, 82, 77, 75, 84, 92, 99, 91, 43, 49, 67, 81, 67, 62, 77, 83, 74,…
$ PNM <dbl> 975.7, 970.4, 966.7, 964.2, 961.4, 963.0, 964.8, 967.1, 992.9, 993.0, 992.9, 994.6, 995.3, 995.6, 9…
$ DD <int> 140, 140, 110, 140, 160, 160, 90, 110, 230, 230, 270, 290, 290, 320, 320, 320, 160, 200, 50, 200, 1…
$ FF <dbl> 13, 30, 50, 52, 74, 61, 44, 13, 15, 15, 26, 28, 41, 37, 50, 74, 20, 7, 13, 7, 11, 46, 56, 74, 19, 2…
summary(data$NOMBRE)
Length Class Mode
2040 character character
summary(data$PROVINCIA)
Length Class Mode
2040 character character
summary(data$ALTURA)
Min. 1st Qu. Median Mean 3rd Qu. Max.
3.0 43.0 125.0 309.2 450.0 3459.0
summary(data$LATITUD)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-77.87 -37.63 -34.13 -35.47 -30.27 -22.10
summary(data$LONGITUD)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-72.05 -66.28 -63.02 -62.85 -58.73 -34.57
summary(data$TEMP)
Min. 1st Qu. Median Mean 3rd Qu. Max.
-20.00 16.10 20.20 19.09 23.80 33.60
summary(data$HUM)
Min. 1st Qu. Median Mean 3rd Qu. Max.
18.00 59.00 72.00 71.19 85.00 100.00
summary(data$PNM)
Min. 1st Qu. Median Mean 3rd Qu. Max.
960.8 1011.1 1015.1 1074.7 1019.1 3133.0
summary(data$DD)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0 50.0 70.0 118.9 160.0 990.0
summary(data$FF)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 9.00 15.00 16.06 22.00 107.00
Volvemos realizar el grafico de correlacion incluyendo la variable altura.
corr <- cor(data[, c(3,4,5,6,7,8,9,10)], use = "complete.obs")
ggcorrplot(corr, type = "lower", outline.col = "black",
lab=TRUE,
ggtheme = ggplot2::theme_gray,
colors = c("#6D9EC1", "white", "#E46726"))
Se observa que hay estaciones que tiene las observaciones en 0 para la variable FF y DD. Consideramos esto como un error en el instrumento de medicion, por lo cual vamos a eliminar a esa estacion del analisis.
data = data[(data$FF != 0) & (data$DD != 0),]
rownames(data) <- 1:nrow(data)
Agrupamos los datos por nombre calculando el promedio y desvio del viento
data_agg = data %>%
group_by(NOMBRE) %>%
summarise(MEAN_VIENTO_KMH = mean(FF),
SD_VIENTO_KMH = sd(FF),
LONGITUD = unique(LONGITUD),
LATITUD = unique(LATITUD),
.groups = "keep")
Vemos que en el dataset resultante nos quedan 98 observaciones que coinciden con la cantidad de estaciones meteorologicas originales
Convertimos data_agg a data.frame ya que necesitamos este tipo de dato para poder trabajar
data_agg = data.frame(data_agg)
Transformamos df_data_agg en un archivo geográfico utilizando el código de proyección mercator
data_agg_sf = st_as_sf(data_agg, coords = c("LONGITUD", "LATITUD"), crs = 4326)
Validamos la clase del nuevo dataframe
class(data_agg_sf)
[1] "sf" "data.frame"
En el siguiente gráfico de la republica argentina se observan en color azul las estaciones meteorológicas donde se realizaron las mediciones de la variables que estan presentes en el dataset
Queremos que en el mapa se vea como etiqueta el nombre de la base meteorologica. Para eso aplicamos la siguiente funcion
labs <- lapply(seq(nrow(data_agg_sf)), function(i) {
paste0( '<p>', data_agg_sf[i, "NOMBRE"], '<p>', '<p>',data_agg_sf[i, "MEAN_VIENTO_KMH"],'</p>' ) })
Realizamos el grafico interactivo de las estaciones meteorologicas graficadas sobre un mapa de Argentina.
leaflet() %>%
addTiles() %>%
addCircles(data = data_agg_sf, weight = 3, label = lapply(labs, htmltools::HTML))
NA
En el mapa observamos que hay puntos muy distantes de la Argentina continental. Dado el proposito de este estudio, el cual es determinar la ubicacion geografica óptima en base a la variable velocidad del viento, decidimos remover estas observaciones ya que no aporta informacion util y ademas agregan ruido a nuestro analisis.
Primero, vamos a borrar las estaciones que no estan en la plataforma continental argentina - Base Carlini - Base San Martin - Base Marambio - Base Esperanza - Base Orcadas
data_agg_sf = data_agg_sf[data_agg_sf$NOMBRE != "BASE CARLINI (EX JUBANY)",]
data_agg_sf = data_agg_sf[data_agg_sf$NOMBRE != "BASE SAN MARTIN",]
data_agg_sf = data_agg_sf[data_agg_sf$NOMBRE != "BASE MARAMBIO",]
data_agg_sf = data_agg_sf[data_agg_sf$NOMBRE != "BASE ESPERANZA",]
data_agg_sf = data_agg_sf[data_agg_sf$NOMBRE != "BASE ORCADAS",]
data_agg_sf = data_agg_sf[data_agg_sf$NOMBRE != "BASE BELGRANO II",]
Repetimos el plot para validar
labs <- lapply(seq(nrow(data_agg_sf)), function(i) {
paste0( '<p>', data_agg_sf[i, "NOMBRE"], '</p>' ) })
leaflet() %>%
addTiles() %>%
addCircles(data = data_agg_sf, weight = 3, label = lapply(labs, htmltools::HTML))
NA
Observemos el resumen estadistico de las nuevas variables MEAN_VIENTO_KMH y SD_VIENTO_KMH
describe(data_agg_sf$MEAN_VIENTO_KMH)
hist(data_agg_sf$MEAN_VIENTO_KMH)
boxplot(data_agg_sf$MEAN_VIENTO_KMH)
describe(data_agg_sf$SD_VIENTO_KMH)
hist(data_agg_sf$SD_VIENTO_KMH)
boxplot(data_agg_sf$SD_VIENTO_KMH)
Veamos si esta nueva variable MEAN_VIENTO_KMH es normal.
hist(data_agg_sf$MEAN_VIENTO_KMH)
boxplot(data_agg_sf$MEAN_VIENTO_KMH)
qqnorm(data_agg_sf$MEAN_VIENTO_KMH)
qqline(data_agg_sf$MEAN_VIENTO_KMH, col=2)
shapiro.test(data_agg_sf$MEAN_VIENTO_KMH)
Shapiro-Wilk normality test
data: data_agg_sf$MEAN_VIENTO_KMH
W = 0.94243, p-value = 9.648e-05
Claramente la variable MEAN_VIENTO_KMH no es normal. El qqplot pone en evidencia la existencia de colas pesadas. Ademas, al realizar el test de shapiro wilk el p-value obtenido es menor a 0.05, lo cual indica que los datos que tenemos no son normales #TODO: agregar algo mas a esta conclusion?
Ahora, procedemos a analizar la existencia de inliers, y en el caso de encontrarlos, eliminarlos.Para eso, usamos el test de moran. Basicamente lo que estamos testeando es que el promedio del viento este dristribuido de manera aleatoria siguiendo un proceso aleatorio.
knea <- knearneigh(data_agg_sf)
neib <- knn2nb(knea)
listw <- nb2listw(neib)
moran_test <- moran.test(data_agg_sf$MEAN_VIENTO_KMH, listw)
moran_test
Moran I test under randomisation
data: data_agg_sf$MEAN_VIENTO_KMH
weights: listw
Moran I statistic standard deviate = 3.8355, p-value = 6.265e-05
alternative hypothesis: greater
sample estimates:
Moran I statistic Expectation Variance
0.423564794 -0.008849558 0.012710177
geary_test <- geary.test(data_agg_sf$MEAN_VIENTO_KMH, listw)
geary_test
Geary C test under randomisation
data: data_agg_sf$MEAN_VIENTO_KMH
weights: listw
Geary C statistic standard deviate = 4.083, p-value = 2.223e-05
alternative hypothesis: Expectation greater than statistic
sample estimates:
Geary C statistic Expectation Variance
0.46134647 1.00000000 0.01740455
shaphiro_test <- shapiro.test(data_agg_sf$MEAN_VIENTO_KMH)
shaphiro_test
Shapiro-Wilk normality test
data: data_agg_sf$MEAN_VIENTO_KMH
W = 0.94243, p-value = 9.648e-05
Vemos que los datos no son normales, hacemos el grafico de moran para identificar los inliers
moran <- moran.plot(data_agg_sf$MEAN_VIENTO_KMH, listw = listw)
removemos la primera capa de inliers
data_agg_sf[10,]
Simple feature collection with 1 feature and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -65.76667 ymin: -28.6 xmax: -65.76667 ymax: -28.6
Geodetic CRS: WGS 84
NOMBRE MEAN_VIENTO_KMH SD_VIENTO_KMH geometry
16 CATAMARCA AERO 47.5 11.19949 POINT (-65.76667 -28.6)
data_agg_sf[31,]
Simple feature collection with 1 feature and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -63.75 ymin: -35.7 xmax: -63.75 ymax: -35.7
Geodetic CRS: WGS 84
NOMBRE MEAN_VIENTO_KMH SD_VIENTO_KMH geometry
37 GENERAL PICO AERO 30.25 4.234777 POINT (-63.75 -35.7)
data_agg_sf[79,]
Simple feature collection with 1 feature and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -64.23333 ymin: -33.11667 xmax: -64.23333 ymax: -33.11667
Geodetic CRS: WGS 84
NOMBRE MEAN_VIENTO_KMH SD_VIENTO_KMH geometry
85 RIO CUARTO AERO 32.5 7.607014 POINT (-64.23333 -33.11667)
data_agg_sf[108,]
Simple feature collection with 1 feature and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -61.96667 ymin: -33.66667 xmax: -61.96667 ymax: -33.66667
Geodetic CRS: WGS 84
NOMBRE MEAN_VIENTO_KMH SD_VIENTO_KMH geometry
114 VENADO TUERTO AERO 32 11.09955 POINT (-61.96667 -33.66667)
data_agg_sf[68,]
Simple feature collection with 1 feature and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -62.38333 ymin: -37.6 xmax: -62.38333 ymax: -37.6
Geodetic CRS: WGS 84
NOMBRE MEAN_VIENTO_KMH SD_VIENTO_KMH geometry
74 PIGUE AERO 38.23077 6.905962 POINT (-62.38333 -37.6)
data_agg_sf[21,]
Simple feature collection with 1 feature and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -61.88333 ymin: -37.43333 xmax: -61.88333 ymax: -37.43333
Geodetic CRS: WGS 84
NOMBRE MEAN_VIENTO_KMH SD_VIENTO_KMH geometry
27 CORONEL SUAREZ AERO 27.33333 5.278889 POINT (-61.88333 -37.43333)
data_agg_sf[89,]
Simple feature collection with 1 feature and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -66.35 ymin: -33.26667 xmax: -66.35 ymax: -33.26667
Geodetic CRS: WGS 84
NOMBRE MEAN_VIENTO_KMH SD_VIENTO_KMH geometry
95 SAN LUIS AERO 17 7.901568 POINT (-66.35 -33.26667)
data_agg_sf[23,]
Simple feature collection with 1 feature and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -57.73333 ymin: -36.35 xmax: -57.73333 ymax: -36.35
Geodetic CRS: WGS 84
NOMBRE MEAN_VIENTO_KMH SD_VIENTO_KMH geometry
29 DOLORES AERO 15.83333 6.105355 POINT (-57.73333 -36.35)
data_agg_sf[109,]
Simple feature collection with 1 feature and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -65.43333 ymin: -36.21667 xmax: -65.43333 ymax: -36.21667
Geodetic CRS: WGS 84
NOMBRE MEAN_VIENTO_KMH SD_VIENTO_KMH geometry
115 VICTORICA 14 8.660254 POINT (-65.43333 -36.21667)
data_agg_sf = data_agg_sf[data_agg_sf$NOMBRE != "CATAMARCA AERO",]
data_agg_sf = data_agg_sf[data_agg_sf$NOMBRE != "GENERAL PICO AERO",]
data_agg_sf = data_agg_sf[data_agg_sf$NOMBRE != "RIO CUARTO AERO",]
data_agg_sf = data_agg_sf[data_agg_sf$NOMBRE != "VENADO TUERTO AERO",]
data_agg_sf = data_agg_sf[data_agg_sf$NOMBRE != "PIGUE AERO",]
data_agg_sf = data_agg_sf[data_agg_sf$NOMBRE != "CORONEL SUAREZ AERO",]
data_agg_sf = data_agg_sf[data_agg_sf$NOMBRE != "SAN LUIS AERO",]
data_agg_sf = data_agg_sf[data_agg_sf$NOMBRE != "DOLORES AERO",]
data_agg_sf = data_agg_sf[data_agg_sf$NOMBRE != "VICTORICA",]
rownames(data_agg_sf) <- 1:nrow(data_agg_sf)
Verificamos los resultados dsp de eliminar inliers
# Creamos una lista de vecinos
knea <- knearneigh(data_agg_sf)
neib <- knn2nb(knea)
listw <- nb2listw(neib)
# Hacemos el test de moran
moran_test_v2 <- moran.test(data_agg_sf$MEAN_VIENTO_KMH, listw)
moran_test_v2
Moran I test under randomisation
data: data_agg_sf$MEAN_VIENTO_KMH
weights: listw
Moran I statistic standard deviate = 4.1509, p-value = 1.656e-05
alternative hypothesis: greater
sample estimates:
Moran I statistic Expectation Variance
0.483041908 -0.009615385 0.014086783
geary_test_v2 <- geary.test(data_agg_sf$MEAN_VIENTO_KMH, listw)
geary_test_v2
Geary C test under randomisation
data: data_agg_sf$MEAN_VIENTO_KMH
weights: listw
Geary C statistic standard deviate = 4.6029, p-value = 2.083e-06
alternative hypothesis: Expectation greater than statistic
sample estimates:
Geary C statistic Expectation Variance
0.41937891 1.00000000 0.01591176
shaphiro_test_v2 <- shapiro.test(data_agg_sf$MEAN_VIENTO_KMH)
shaphiro_test_v2
Shapiro-Wilk normality test
data: data_agg_sf$MEAN_VIENTO_KMH
W = 0.96597, p-value = 0.008459
moran.plot(data_agg_sf$MEAN_VIENTO_KMH, listw = listw)
qqnorm(data_agg_sf$MEAN_VIENTO_KMH)
qqline(data_agg_sf$MEAN_VIENTO_KMH, col=2)
Vemos que aun no logramos normalidad. Removemos la segunda capa de inliers. Creamos un dataset auxiliar para continuar removiendo las observaciones.
data_agg_sf_clean = data_agg_sf
data_agg_sf_clean[56,]
Simple feature collection with 1 feature and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -55.13333 ymin: -27.48333 xmax: -55.13333 ymax: -27.48333
Geodetic CRS: WGS 84
NOMBRE MEAN_VIENTO_KMH SD_VIENTO_KMH geometry
56 OBERA 4 0 POINT (-55.13333 -27.48333)
data_agg_sf_clean[96,]
Simple feature collection with 1 feature and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -62.73333 ymin: -35.96667 xmax: -62.73333 ymax: -35.96667
Geodetic CRS: WGS 84
NOMBRE MEAN_VIENTO_KMH SD_VIENTO_KMH geometry
96 TRENQUE LAUQUEN 10.33333 3.05505 POINT (-62.73333 -35.96667)
data_agg_sf_clean[39,]
Simple feature collection with 1 feature and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -63.36667 ymin: -34.13333 xmax: -63.36667 ymax: -34.13333
Geodetic CRS: WGS 84
NOMBRE MEAN_VIENTO_KMH SD_VIENTO_KMH geometry
39 LABOULAYE AERO 23.875 8.331775 POINT (-63.36667 -34.13333)
data_agg_sf_clean[57,]
Simple feature collection with 1 feature and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -60.21667 ymin: -36.88333 xmax: -60.21667 ymax: -36.88333
Geodetic CRS: WGS 84
NOMBRE MEAN_VIENTO_KMH SD_VIENTO_KMH geometry
57 OLAVARRIA AERO 26.625 6.020797 POINT (-60.21667 -36.88333)
data_agg_sf_clean[105,]
Simple feature collection with 1 feature and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -65.38333 ymin: -33.73333 xmax: -65.38333 ymax: -33.73333
Geodetic CRS: WGS 84
NOMBRE MEAN_VIENTO_KMH SD_VIENTO_KMH geometry
105 VILLA REYNOLDS AERO 30.95833 16.53581 POINT (-65.38333 -33.73333)
data_agg_sf_clean[69,]
Simple feature collection with 1 feature and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -57.28333 ymin: -35.36667 xmax: -57.28333 ymax: -35.36667
Geodetic CRS: WGS 84
NOMBRE MEAN_VIENTO_KMH SD_VIENTO_KMH geometry
69 PUNTA INDIO B.A. 29 4.434712 POINT (-57.28333 -35.36667)
data_agg_sf_clean[3,]
Simple feature collection with 1 feature and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -62.16667 ymin: -38.73333 xmax: -62.16667 ymax: -38.73333
Geodetic CRS: WGS 84
NOMBRE MEAN_VIENTO_KMH SD_VIENTO_KMH geometry
3 BAHIA BLANCA AERO 28.58333 6.439799 POINT (-62.16667 -38.73333)
data_agg_sf_clean[62,]
Simple feature collection with 1 feature and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -61.9 ymin: -35.86667 xmax: -61.9 ymax: -35.86667
Geodetic CRS: WGS 84
NOMBRE MEAN_VIENTO_KMH SD_VIENTO_KMH geometry
62 PEHUAJO AERO 26.4375 6.087898 POINT (-61.9 -35.86667)
data_agg_sf_clean[86,]
Simple feature collection with 1 feature and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -64.26667 ymin: -36.56667 xmax: -64.26667 ymax: -36.56667
Geodetic CRS: WGS 84
NOMBRE MEAN_VIENTO_KMH SD_VIENTO_KMH geometry
86 SANTA ROSA AERO 30.16667 4.992748 POINT (-64.26667 -36.56667)
data_agg_sf_clean = data_agg_sf_clean[data_agg_sf_clean$NOMBRE != "OBERA",]
data_agg_sf_clean = data_agg_sf_clean[data_agg_sf_clean$NOMBRE != "TRENQUE LAUQUEN",]
data_agg_sf_clean = data_agg_sf_clean[data_agg_sf_clean$NOMBRE != "LABOULAYE AERO",]
data_agg_sf_clean = data_agg_sf_clean[data_agg_sf_clean$NOMBRE != "OLAVARRIA AERO",]
data_agg_sf_clean = data_agg_sf_clean[data_agg_sf_clean$NOMBRE != "VILLA REYNOLDS AERO",]
data_agg_sf_clean = data_agg_sf_clean[data_agg_sf_clean$NOMBRE != "PUNTA INDIO B.A.",]
data_agg_sf_clean = data_agg_sf_clean[data_agg_sf_clean$NOMBRE != "BAHIA BLANCA AERO",]
data_agg_sf_clean = data_agg_sf_clean[data_agg_sf_clean$NOMBRE != "PEHUAJO AERO",]
data_agg_sf_clean = data_agg_sf_clean[data_agg_sf_clean$NOMBRE != "SANTA ROSA AERO",]
rownames(data_agg_sf_clean) <- 1:nrow(data_agg_sf_clean)
Luego de eliminar, volvemos a checkear si nos da normalidad
# Creamos una lista de vecinos
knea <- knearneigh(data_agg_sf_clean)
neib <- knn2nb(knea)
listw <- nb2listw(neib)
# Hacemos el test de moran
moran_test_v2 <- moran.test(data_agg_sf_clean$MEAN_VIENTO_KMH, listw)
moran_test_v2
Moran I test under randomisation
data: data_agg_sf_clean$MEAN_VIENTO_KMH
weights: listw
Moran I statistic standard deviate = 4.6639, p-value = 1.552e-06
alternative hypothesis: greater
sample estimates:
Moran I statistic Expectation Variance
0.57387574 -0.01052632 0.01570120
geary_test_v2 <- geary.test(data_agg_sf_clean$MEAN_VIENTO_KMH, listw)
geary_test_v2
Geary C test under randomisation
data: data_agg_sf_clean$MEAN_VIENTO_KMH
weights: listw
Geary C statistic standard deviate = 4.7143, p-value = 1.213e-06
alternative hypothesis: Expectation greater than statistic
sample estimates:
Geary C statistic Expectation Variance
0.38036555 1.00000000 0.01727558
shaphiro_test_v2 <- shapiro.test(data_agg_sf_clean$MEAN_VIENTO_KMH)
shaphiro_test_v2
Shapiro-Wilk normality test
data: data_agg_sf_clean$MEAN_VIENTO_KMH
W = 0.96445, p-value = 0.0105
moran.plot(data_agg_sf_clean$MEAN_VIENTO_KMH, listw = listw)
hist(data_agg_sf_clean$MEAN_VIENTO_KMH)
qqnorm(data_agg_sf_clean$MEAN_VIENTO_KMH)
qqline(data_agg_sf_clean$MEAN_VIENTO_KMH, col=2)
removemos la tercera capa de inliers
data_agg_sf_clean[31,]
Simple feature collection with 1 feature and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -68.75 ymin: -30.23333 xmax: -68.75 ymax: -30.23333
Geodetic CRS: WGS 84
NOMBRE MEAN_VIENTO_KMH SD_VIENTO_KMH geometry
31 JACHAL 4 NA POINT (-68.75 -30.23333)
data_agg_sf_clean[39,]
Simple feature collection with 1 feature and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -60.58333 ymin: -24.7 xmax: -60.58333 ymax: -24.7
Geodetic CRS: WGS 84
NOMBRE MEAN_VIENTO_KMH SD_VIENTO_KMH geometry
39 LAS LOMITAS 4.4375 2.064582 POINT (-60.58333 -24.7)
data_agg_sf_clean[90,]
Simple feature collection with 1 feature and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -65.1 ymin: -26.85 xmax: -65.1 ymax: -26.85
Geodetic CRS: WGS 84
NOMBRE MEAN_VIENTO_KMH SD_VIENTO_KMH geometry
90 TUCUMAN AERO 8.944444 2.235337 POINT (-65.1 -26.85)
data_agg_sf_clean[22,]
Simple feature collection with 1 feature and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -61.66667 ymin: -32.2 xmax: -61.66667 ymax: -32.2
Geodetic CRS: WGS 84
NOMBRE MEAN_VIENTO_KMH SD_VIENTO_KMH geometry
22 EL TREBOL 14.4 6.14817 POINT (-61.66667 -32.2)
data_agg_sf_clean[84,]
Simple feature collection with 1 feature and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -59.25 ymin: -37.23333 xmax: -59.25 ymax: -37.23333
Geodetic CRS: WGS 84
NOMBRE MEAN_VIENTO_KMH SD_VIENTO_KMH geometry
84 TANDIL AERO 25.95833 6.772574 POINT (-59.25 -37.23333)
data_agg_sf_clean[95,]
Simple feature collection with 1 feature and 3 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -65.13333 ymin: -31.95 xmax: -65.13333 ymax: -31.95
Geodetic CRS: WGS 84
NOMBRE MEAN_VIENTO_KMH SD_VIENTO_KMH geometry
95 VILLA DOLORES AERO 26.04762 13.80752 POINT (-65.13333 -31.95)
data_agg_sf_clean = data_agg_sf_clean[data_agg_sf_clean$NOMBRE != "JACHAL",]
data_agg_sf_clean = data_agg_sf_clean[data_agg_sf_clean$NOMBRE != "LAS LOMITAS",]
data_agg_sf_clean = data_agg_sf_clean[data_agg_sf_clean$NOMBRE != "TUCUMAN AERO",]
data_agg_sf_clean = data_agg_sf_clean[data_agg_sf_clean$NOMBRE != "EL TREBOL",]
data_agg_sf_clean = data_agg_sf_clean[data_agg_sf_clean$NOMBRE != "TANDIL AERO",]
data_agg_sf_clean = data_agg_sf_clean[data_agg_sf_clean$NOMBRE != "VILLA DOLORES AERO",]
rownames(data_agg_sf_clean) <- 1:nrow(data_agg_sf_clean)
Checkeamos si logramos normalidad
# Creamos una lista de vecinos
knea <- knearneigh(data_agg_sf_clean)
neib <- knn2nb(knea)
listw <- nb2listw(neib)
# Hacemos el test de moran
moran_test_v3 <- moran.test(data_agg_sf_clean$MEAN_VIENTO_KMH, listw)
moran_test_v3
Moran I test under randomisation
data: data_agg_sf_clean$MEAN_VIENTO_KMH
weights: listw
Moran I statistic standard deviate = 4.169, p-value = 1.53e-05
alternative hypothesis: greater
sample estimates:
Moran I statistic Expectation Variance
0.52658646 -0.01123596 0.01664269
geary_test_v3 <- geary.test(data_agg_sf_clean$MEAN_VIENTO_KMH, listw)
geary_test_v3
Geary C test under randomisation
data: data_agg_sf_clean$MEAN_VIENTO_KMH
weights: listw
Geary C statistic standard deviate = 4.5108, p-value = 3.229e-06
alternative hypothesis: Expectation greater than statistic
sample estimates:
Geary C statistic Expectation Variance
0.39098526 1.00000000 0.01822843
shaphiro_test_v3 <- shapiro.test(data_agg_sf_clean$MEAN_VIENTO_KMH)
shaphiro_test_v3
Shapiro-Wilk normality test
data: data_agg_sf_clean$MEAN_VIENTO_KMH
W = 0.9569, p-value = 0.004576
moran.plot(data_agg_sf_clean$MEAN_VIENTO_KMH, listw = listw)
hist(data_agg_sf_clean$MEAN_VIENTO_KMH)
qqnorm(data_agg_sf_clean$MEAN_VIENTO_KMH)
qqline(data_agg_sf_clean$MEAN_VIENTO_KMH, col=2)
De acuerdo a los resultados anteriores, concluimos que la distribucion de los datos no es normal. Seguimos trabajando con el dataset data_agg_sf que tiene una sola capa de limpieza de inliers.
# calculamos el coeficiente de moran
knea <- knearneigh(data_agg_sf)
neib <- knn2nb(knea)
listw <- nb2listw(neib)
moran(data_agg_sf$MEAN_VIENTO_KMH, listw, length(listw$weights),Szero(listw),zero.policy = FALSE)
$I
[1] 0.4830419
$K
[1] 2.318574
moran_test_final <- moran.test(data_agg_sf$MEAN_VIENTO_KMH, listw)
moran_test_final
Moran I test under randomisation
data: data_agg_sf$MEAN_VIENTO_KMH
weights: listw
Moran I statistic standard deviate = 4.1509, p-value = 1.656e-05
alternative hypothesis: greater
sample estimates:
Moran I statistic Expectation Variance
0.483041908 -0.009615385 0.014086783
geary_test_final <- geary.test(data_agg_sf$MEAN_VIENTO_KMH, listw)
geary_test_final
Geary C test under randomisation
data: data_agg_sf$MEAN_VIENTO_KMH
weights: listw
Geary C statistic standard deviate = 4.6029, p-value = 2.083e-06
alternative hypothesis: Expectation greater than statistic
sample estimates:
Geary C statistic Expectation Variance
0.41937891 1.00000000 0.01591176
Tanto el coeficiente I de Moran([-1,1]) como el indicador de continuidad C de Geary ([0,2]) demuestran que los datos presentan una autocorrelacion positiva en donde se puede decir que tienden a una concentracion espacial.
library(sfheaders)
data_agg_v2 = sf_to_df(data_agg_sf, fill = TRUE)
data_agg_v2 = data_agg_v2[,c(2,6,7)]
coordinates(data_agg_v2) <- ~x+y
data_agg_v2_geo<-as.geodata(data_agg_v2)
plot(data_agg_v2_geo)
Al analizar estos graficos obtenidos, concluimos que pareciera exisir una leve tendencia negativa a medida que aumenta los valores en el eje de la coordenada Y
data_variogram = sf_to_df(data_agg_sf, fill = TRUE)
data_variogram <- data_variogram %>% dplyr::select(MEAN_VIENTO_KMH,x,y)
coordinates(data_variogram) <- ~x+y
v <- variogram(MEAN_VIENTO_KMH~1, data_variogram)
plot(v)
v_alpha <- variogram(MEAN_VIENTO_KMH~1, data_variogram, alpha = c(0, 45, 90, 135))
plot(v_alpha)
v_cloud <- variogram(MEAN_VIENTO_KMH~1, data_variogram, cloud=TRUE)
plot(v_cloud)
# Elegimos a ojo el valor del cutoff
plot(variogram(MEAN_VIENTO_KMH~1, data_variogram, cutoff=20, width=30, map=T))
plot(variogram(MEAN_VIENTO_KMH~1, data_variogram, cutoff=10, width=5, map=T))
plot(variogram(MEAN_VIENTO_KMH~1, data_variogram, cutoff=10, width=1, map=T))
plot(variogram(MEAN_VIENTO_KMH~1, data_variogram, cutoff=40, width=2, map=T))
A partir del segundo gráfico, estamos en presencia de anisotropia en nuestro variograma. A partir del ultimo grafico del variograma, concluimos que tenemos una direccionalidad del sudoeste al noreste.
Ajustamos contra el variograma empirico
vt_lin = fit.variogram(v, vgm(60, "Lin", 15, 14))
singular model in variogram fit
vt_lin
plot(v , vt_lin)
vt_sph = fit.variogram(v, vgm(60, "Sph", 15, 15))
vt_sph
plot(v , vt_sph)
vt_exp = fit.variogram(v, vgm(70, "Exp", 15, 15))
vt_exp
plot(v , vt_exp)
vt_mat = fit.variogram(v, vgm(70, "Mat", 15, 15))
plot(v , vt_mat)
vt_exc = fit.variogram(v, vgm(75, "Exc", 15, 0))
No convergence after 200 iterations: try different initial values?
plot(v , vt_exc)
attr(vt_lin, 'SSErr')
[1] 8611.196
attr(vt_exp, 'SSErr')
[1] 4021.34
attr(vt_mat, 'SSErr')
[1] 4021.34
attr(vt_exc, 'SSErr')
[1] 3525.526
Probemos con tendencia
v_tendencia_v1 <- variogram(MEAN_VIENTO_KMH~y+x, data_variogram)
plot(v_tendencia_v1)
v_alpha <- variogram(MEAN_VIENTO_KMH~y+x, data_variogram, alpha = c(0, 45, 90, 135))
plot(v_alpha)
v_cloud <- variogram(MEAN_VIENTO_KMH~y+x, data_variogram, cloud=TRUE)
plot(v_cloud)
# Elegimos a ojo el valor del cutoff
plot(variogram(MEAN_VIENTO_KMH~y+x, data_variogram, cutoff=20, width=30, map=T))
plot(variogram(MEAN_VIENTO_KMH~y+x, data_variogram, cutoff=10, width=5, map=T))
plot(variogram(MEAN_VIENTO_KMH~y+x, data_variogram, cutoff=40, width=5, map=T))
plot(variogram(MEAN_VIENTO_KMH~y+x, data_variogram, cutoff=40, width=2, map=T))
Mirando el segundo grafico, observamos una mayor autocorrelacion mas fuerte a los 90 y 45 grados, y una autocorrelacion menos fuerte a los 135 y 0 grados. A partir del ultimo grafico del variograma, concluimos que tenemos una direccionalidad del sudoeste al noreste.
Ajustamos contra el teorico
vt_tendencia_v1_lin = fit.variogram(v_tendencia_v1, vgm(60, "Lin", 15, 15))
singular model in variogram fit
plot(v_tendencia_v1 , vt_tendencia_v1_lin)
vt_tendencia_v1_sph = fit.variogram(v_tendencia_v1, vgm(80, "Sph", 15, 15))
vt_tendencia_v1_sph
plot(v , vt_tendencia_v1_sph)
vt_tendencia_v1_exp = fit.variogram(v_tendencia_v1, vgm(70, "Exp", 20, 15))
plot(v_tendencia_v1 , vt_tendencia_v1_exp)
vt_tendencia_v1_mat = fit.variogram(v_tendencia_v1, vgm(50, "Mat", 20, 12))
plot(v_tendencia_v1 , vt_tendencia_v1_mat)
attr(vt_tendencia_v1_lin, 'SSErr')
[1] 18734.48
attr(vt_tendencia_v1_sph, 'SSErr')
[1] 3153.043
attr(vt_tendencia_v1_exp, 'SSErr')
[1] 2239.285
attr(vt_tendencia_v1_mat, 'SSErr')
[1] 2239.284
Probemos ahora con la tendencia negativa
v_tendencia_v2 <- variogram(MEAN_VIENTO_KMH~x-y, data_variogram)
plot(v_tendencia_v2)
v_alpha <- variogram(MEAN_VIENTO_KMH~x-y, data_variogram, alpha = c(0, 45, 90, 135))
plot(v_alpha)
v_cloud <- variogram(MEAN_VIENTO_KMH~x-y, data_variogram, cloud=TRUE)
plot(v_cloud)
# Elegimos a ojo el valor del cutoff
plot(variogram(MEAN_VIENTO_KMH~x-y, data_variogram, cutoff=20, width=30, map=T))
plot(variogram(MEAN_VIENTO_KMH~x-y, data_variogram, cutoff=10, width=5, map=T))
plot(variogram(MEAN_VIENTO_KMH~x-y, data_variogram, cutoff=40, width=2, map=T))
vt_tendencia_v2_lin = fit.variogram(v_tendencia_v2, vgm(125, "Lin", 30, 10))
singular model in variogram fit
plot(v_tendencia_v2 , vt_tendencia_v2_lin)
vt_tendencia_v2_exp = fit.variogram(v_tendencia_v2, vgm(125, "Exp", 30, 10))
plot(v_tendencia_v2 , vt_tendencia_v2_exp)
vt_tendencia_v2_sph = fit.variogram(v_tendencia_v2, vgm(90, "Sph", 30, 10))
plot(v_tendencia_v2 , vt_tendencia_v2_sph)
attr(vt_tendencia_v2_lin, 'SSErr')
[1] 7971.275
attr(vt_tendencia_v2_exp, 'SSErr')
[1] 3398.701
attr(vt_tendencia_v2_sph, 'SSErr')
[1] 4203.725
#Kriging
Armamos la grilla
data_kriging <- data_variogram
departamentos <- st_read("data_departamentos/Codgeo_Pais_x_dpto_con_datos/pxdptodatosok.shp")
Reading layer `pxdptodatosok' from data source `/Users/antonellaschiavoni/Antonella/Maestria/EstadisticaEspacial/TPFinal/Datos/trabajo_final/smn/estadistica_espacial/data_departamentos/Codgeo_Pais_x_dpto_con_datos/pxdptodatosok.shp' using driver `ESRI Shapefile'
Simple feature collection with 527 features and 10 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: -74.02985 ymin: -90 xmax: -25.02314 ymax: -21.74506
Geodetic CRS: WGS 84
departamentos <-departamentos[departamentos$departamen != "Antártida Argentina",]
departamentos <-departamentos[departamentos$departamen != "Islas del Atlántico Sur",]
departamentos <- as_Spatial(departamentos)
grilla <- as.data.frame(spsample(departamentos, type="regular", n=5000))
CRS object has comment, which is lost in output
names(grilla) <- c("x", "y")
coordinates(grilla) <- c("x", "y")
plot(grilla)
gridded(grilla) <- TRUE
grid has empty column/rows in dimension 2
fullgrid(grilla) <- TRUE
plot(grilla)
proj4string(grilla) <- proj4string(departamentos)
CRS object has comment, which is lost in output
proj4string(data_kriging) <- proj4string(departamentos)
CRS object has comment, which is lost in output
ko1 <- krige(MEAN_VIENTO_KMH~1, data_kriging, grilla, model = vt_lin, nmax=15)
[using ordinary kriging]
ko2 <- krige(MEAN_VIENTO_KMH~1, data_kriging, grilla, model = vt_exp, nmax=15)
[using ordinary kriging]
ko3 <- krige(MEAN_VIENTO_KMH~1, data_kriging, grilla, model = vt_sph, nmax=15)
[using ordinary kriging]
ko4 <- krige(MEAN_VIENTO_KMH~x+y, data_kriging, grilla, model = vt_tendencia_v1_exp, nmax=15)
[using universal kriging]
ko5 <- krige(MEAN_VIENTO_KMH~x+y, data_kriging, grilla, model = vt_tendencia_v1_mat, nmax=15)
[using universal kriging]
ko6 <- krige(MEAN_VIENTO_KMH~x-y, data_kriging, grilla, model = vt_tendencia_v2_exp, nmax=15)
[using universal kriging]
ko7 <- krige(MEAN_VIENTO_KMH~x-y, data_kriging, grilla, model = vt_tendencia_v2_sph, nmax=15)
[using universal kriging]
ko1_cv <- krige.cv(MEAN_VIENTO_KMH~1, data_kriging, grilla, model = vt_lin, nmax=15, nfold=10) %>% mutate(modelo = "v0_linl")
|
| | 0%
|
|============== | 11%
|
|============================= | 22%
|
|=========================================== | 33%
|
|========================================================= | 44%
|
|======================================================================== | 56%
|
|====================================================================================== | 67%
|
|==================================================================================================== | 78%
|
|=================================================================================================================== | 89%
|
|=================================================================================================================================| 100%
coordinates(ko1_cv) <- ~x+y
ko1_cv <- st_as_sf(ko1_cv)
ko2_cv <- krige.cv(MEAN_VIENTO_KMH~1, data_kriging, grilla, model = vt_exp, nmax=15, nfold=10) %>% mutate(modelo = "v0_exp")
|
| | 0%
|
|============== | 11%
|
|============================= | 22%
|
|=========================================== | 33%
|
|========================================================= | 44%
|
|======================================================================== | 56%
|
|====================================================================================== | 67%
|
|==================================================================================================== | 78%
|
|=================================================================================================================== | 89%
|
|=================================================================================================================================| 100%
coordinates(ko2_cv) <- ~x+y
ko2_cv <- st_as_sf(ko2_cv)
ko3_cv <- krige.cv(MEAN_VIENTO_KMH~1, data_kriging, grilla, model = vt_sph, nmax=15, nfold=10) %>% mutate(modelo = "v0_sph")
|
| | 0%
|
|============== | 11%
|
|============================= | 22%
|
|=========================================== | 33%
|
|========================================================= | 44%
|
|======================================================================== | 56%
|
|====================================================================================== | 67%
|
|==================================================================================================== | 78%
|
|=================================================================================================================== | 89%
|
|=================================================================================================================================| 100%
coordinates(ko3_cv) <- ~x+y
ko3_cv <- st_as_sf(ko3_cv)
ko4_cv <- krige.cv(MEAN_VIENTO_KMH~x+y, data_kriging, grilla, model = vt_tendencia_v1_exp, nmax=15, nfold=10) %>% mutate(modelo = "v1_exp")
|
| | 0%
|
|============== | 11%
|
|============================= | 22%
|
|=========================================== | 33%
|
|========================================================= | 44%
|
|======================================================================== | 56%
|
|====================================================================================== | 67%
|
|==================================================================================================== | 78%
|
|=================================================================================================================== | 89%
|
|=================================================================================================================================| 100%
coordinates(ko4_cv) <- ~x+y
ko4_cv <- st_as_sf(ko4_cv)
ko5_cv <- krige.cv(MEAN_VIENTO_KMH~x+y, data_kriging, grilla, model = vt_tendencia_v1_mat, nmax=15, nfold=10) %>% mutate(modelo = "v1_mat")
|
| | 0%
|
|============== | 11%
|
|============================= | 22%
|
|=========================================== | 33%
|
|========================================================= | 44%
|
|======================================================================== | 56%
|
|====================================================================================== | 67%
|
|==================================================================================================== | 78%
|
|=================================================================================================================== | 89%
|
|=================================================================================================================================| 100%
coordinates(ko5_cv) <- ~x+y
ko5_cv <- st_as_sf(ko5_cv)
ko6_cv <- krige.cv(MEAN_VIENTO_KMH~x-y, data_kriging, grilla, model = vt_tendencia_v2_exp, nmax=15, nfold=5) %>% mutate(modelo = "v2_exp")
|
| | 0%
|
|================================ | 25%
|
|================================================================ | 50%
|
|================================================================================================= | 75%
|
|=================================================================================================================================| 100%
coordinates(ko6_cv) <- ~x+y
ko6_cv <- st_as_sf(ko6_cv)
ko7_cv <- krige.cv(MEAN_VIENTO_KMH~x-y, data_kriging, grilla, model = vt_tendencia_v2_sph, nmax=15, nfold=10) %>% mutate(modelo = "v2_sph")
|
| | 0%
|
|============== | 11%
|
|============================= | 22%
|
|=========================================== | 33%
|
|========================================================= | 44%
|
|======================================================================== | 56%
|
|====================================================================================== | 67%
|
|==================================================================================================== | 78%
|
|=================================================================================================================== | 89%
|
|=================================================================================================================================| 100%
coordinates(ko7_cv) <- ~x+y
ko7_cv <- st_as_sf(ko7_cv)
pred_cv <- list(ko1_cv,ko2_cv,ko3_cv,ko4_cv,ko5_cv,ko6_cv,ko7_cv) %>% bind_rows()
resumen <- pred_cv %>% as.data.frame() %>% group_by(modelo) %>%
summarise(RMSE = sqrt(sum(residual^2)/length(residual))) %>%
arrange(RMSE)
resumen
variogram_residual <- variogram(residual~1, ko4_cv)
ggplot(variogram_residual, aes(x = dist, y = gamma)) + geom_point() + theme_bw() +
xlab("Ditancia metros") + ylim(c(0, max(variogram_residual$gamma)))
#Observamos las predicciones
ggplot() +
geom_sf(data = ko4_cv, aes(color = exp(var1.pred))) +
scale_color_viridis_c(name = "Promedio de la velocidad del viento")
ggplot() +
geom_sf(data = ko4_cv, aes(color = factor(fold))) +
facet_wrap(~factor(fold)) +
scale_color_viridis_d(name = "Folds")
#Observamos la varianza
ggplot() +
geom_sf(data = ko4_cv, aes(color = exp(var1.var))) +
scale_color_viridis_c()
ko5_cv$var1.pred <- round(ko5_cv$var1.pred,2)
#Observamos las predicciones
ggplot() +
geom_sf(data = ko5_cv, aes(color = exp(var1.pred))) +
scale_color_viridis_c()
ggplot() +
geom_sf(data = ko5_cv, aes(color = factor(fold))) +
facet_wrap(~factor(fold)) +
scale_color_viridis_d(name = "Folds")
#Observamos la varianza
ggplot() +
geom_sf(data = ko5_cv, aes(color = exp(var1.var))) +
scale_color_viridis_c()
spplot(ko4["var1.pred"])
spplot(ko4["var1.var"])
spplot(ko5["var1.pred"])
spplot(ko5["var1.var"])
library(raster)
r <- raster(ko5)
r.m <- mask(r, departamentos)
library(tmap)
tm_shape(r.m) +
tm_raster(n=10,
palette="Blues",
auto.palette.mapping=FALSE,
title="Promedio de la velocidad del viento") +
tm_legend(legend.outside=TRUE)
The argument auto.palette.mapping is deprecated. Please use midpoint for numeric data and stretch.palette for categorical data to control the palette mapping.
r <- raster(ko5, layer="var1.var")
r.m <- mask(r, departamentos)
tm_shape(r.m) +
tm_raster(n=7,
palette ="Reds",
title="Variance map ") +
tm_legend(legend.outside=TRUE)
r <- sqrt(raster(ko5, layer="var1.var")) * 1.96
r.m <- mask(r, departamentos)
tm_shape(r.m) +
tm_raster(n=7,
palette ="Reds",
title="95% CI map \n(en km/h)") +
tm_legend(legend.outside=TRUE)